package com.tops.webmagic.processor;

import com.tops.webmagic.constants.ConstantsField;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * WebMagic page processor that extracts the main content block of a page.
 *
 * <p>For every processed page two result fields are published:
 * <ul>
 *   <li>{@code imgList} - the image URLs captured from the content block, or {@code null}
 *       when no image was found;</li>
 *   <li>{@code content} - the HTML of the content block with every paragraph that contains a
 *       hyperlink removed and everything from {@link ConstantsField#END_CONTENT} onwards
 *       cut off.</li>
 * </ul>
 *
 * @author yiping_wang
 */
public class XmPageProcessor implements PageProcessor {

    /** Compiled once: the expression is a constant, so there is no need to recompile it per page. */
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(ConstantsField.REX_IMG_SRC);

    private static final String ANCHOR_OPEN = "<a href";
    private static final String PARAGRAPH_OPEN = "<p>";
    private static final String PARAGRAPH_CLOSE = "</p>";

    /**
     * Crawl configuration for the target site: 3 cycle retries and a sleep time of 1000
     * between requests.
     */
    private final Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1000);

    /**
     * Extracts the image list and the cleaned-up content from the page and stores them as
     * result fields.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        // Select the content block of the page.
        Selectable selectable = page.getHtml().css(ConstantsField.PAGE_CSS_CONTENT);

        // NOTE(review): toString() appears to return null when the selector matched nothing
        // (confirm against the WebMagic version in use). Building a StringBuilder from null
        // would throw a NullPointerException, so skip the page explicitly instead; no result
        // reaches the pipelines in either case.
        String content = selectable.toString();
        if (content == null) {
            page.setSkip(true);
            return;
        }

        // Images: publish the captured URLs, or null when there are none.
        List<String> imgUrl = extractImageUrls(selectable.xpath(ConstantsField.XPATH_IMG).all());
        page.putField("imgList", imgUrl.isEmpty() ? null : imgUrl);

        // Hyperlinks: drop every paragraph that contains a link.
        StringBuilder cleaned = dealLink(new StringBuilder(content));

        // Tail: cut everything from the end marker onwards and close the wrapping <div> again.
        // A marker at index 0 is deliberately ignored (unchanged from the previous behavior).
        int endIndex = cleaned.indexOf(ConstantsField.END_CONTENT);
        if (endIndex > 0) {
            cleaned.setLength(endIndex);
            cleaned.append("</div>");
        }

        page.putField("content", cleaned.toString());
    }

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Collects capture group 1 of every {@link #IMG_SRC_PATTERN} match found in the given
     * image fragments.
     *
     * @param imgFragments fragments selected by {@link ConstantsField#XPATH_IMG}
     * @return the captured values in encounter order; empty when nothing matched
     */
    private static List<String> extractImageUrls(List<String> imgFragments) {
        List<String> urls = new ArrayList<>();
        for (String fragment : imgFragments) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(fragment);
            while (matcher.find()) {
                urls.add(matcher.group(1));
            }
        }
        return urls;
    }

    /**
     * Removes every {@code <p>...</p>} paragraph that contains a hyperlink.
     *
     * <p>For each {@code <a href} occurrence the span from the closest preceding {@code <p>}
     * up to and including the next {@code </p>} is deleted. An anchor that has no preceding
     * {@code <p>} or no following {@code </p>} is left untouched; previously such input made
     * {@link StringBuilder#delete(int, int)} throw a {@link StringIndexOutOfBoundsException}.
     *
     * @param stringBuilder the HTML to clean; it is not modified
     * @return a new builder holding the cleaned HTML
     */
    private static StringBuilder dealLink(StringBuilder stringBuilder) {
        StringBuilder result = new StringBuilder(stringBuilder);
        // Lower bound for the anchor search. It only moves forward past anchors that were left
        // in place, so they are not found again; while nothing is skipped the search restarts
        // from the beginning after each deletion, exactly as before.
        int searchFrom = 0;
        int anchorIndex = result.indexOf(ANCHOR_OPEN, searchFrom);
        while (anchorIndex != -1) {
            int paragraphStart = result.lastIndexOf(PARAGRAPH_OPEN, anchorIndex);
            int paragraphClose = result.indexOf(PARAGRAPH_CLOSE, anchorIndex);
            if (paragraphStart == -1 || paragraphClose == -1) {
                // No enclosing paragraph to remove: keep the anchor and continue behind it.
                searchFrom = anchorIndex + ANCHOR_OPEN.length();
            } else {
                result.delete(paragraphStart, paragraphClose + PARAGRAPH_CLOSE.length());
            }
            anchorIndex = result.indexOf(ANCHOR_OPEN, searchFrom);
        }
        return result;
    }
}
